import matplotlib.pyplot as plt # low level visualization library
import seaborn as sns # higher level visualization library compared to matplotlib
from sklearn.model_selection import train_test_split # library for splitting data before modeling
from sklearn.linear_model import LinearRegression # library with linear model for machine learning
from sklearn.metrics import mean_absolute_error, r2_score # library for metrics for evaluating results of the model
from yellowbrick.regressor import ResidualsPlot # library for visualizing result of your model. Install through Anaconda terminal using: pip install yellowbrick
import statsmodels.api as sm # library with linear model for statistical inference
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer,PowerTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
# Adjust pandas display and formatting settings
# Remove scientific notation and display numbers with 5 decimal places instead
pd.options.display.float_format = '{:,.5f}'.format
# Increase notebook cell width to use the full browser window
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
# Update default style and size of charts
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [20, 10]
# Increase max number of rows and columns to display in pandas tables
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# Load the concrete compressive-strength dataset from the working directory.
df=pd.read_csv('concrete.csv')
# There are no non-predictive (ID-style) columns, so no variable needs removal.
df.head()
df.shape
# There are no missing values in any of the columns and no garbage data.
df.info()
df.describe()
# Rule of thumb for interpreting skewness:
# If skewness is less than -1 or greater than 1, the distribution is highly skewed.
# If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.
# If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.
# We will have to handle skewness of age and superplastic.
df.skew()
# Full exploratory report; syntax for pandas_profiling v2.6 with pandas v1.0+.
profile = df.profile_report(html={'style':{'full_width':True}})
profile
# Human-readable descriptions for every column of the concrete dataset.
# All ingredient quantities are expressed in kg per m3 of mixture.
fieldDescription = dict(
    cement="Cement kg in a m3 mixture",
    slag="Blast Furnace Slag kg in a m3 mixture",
    ash="Fly Ash kg in a m3 mixture",
    water="Water kg in a m3 mixture",
    superplastic="Superplasticizer kg in a m3 mixture",
    coarseagg="Coarse Aggregate kg in a m3 mixture",
    fineagg="Fine Aggregate kg in a m3 mixture",
    age="Age in Days (1~365)",
    strength="Concrete compressive strength",
)
def findOutliers(column, data=None, descriptions=None):
    """Summarize one numeric column: spread, skew direction and IQR outliers.

    Parameters
    ----------
    column : str
        Name of the column to profile.
    data : pandas.DataFrame, optional
        Frame to profile. Defaults to the module-level ``df`` (backward
        compatible with the original global-only behavior).
    descriptions : dict, optional
        Mapping column name -> description. Defaults to the module-level
        ``fieldDescription``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with descriptive statistics and outlier counts
        based on the 1.5 * IQR fence.
    """
    if data is None:
        data = df  # module-level dataset loaded from concrete.csv
    if descriptions is None:
        descriptions = fieldDescription
    series = data[column]
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    median = series.quantile(0.50)
    mean = series.mean()
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # Crude skew direction from the mean/median relationship.
    if mean > median:
        skewed = 'RightSkewed'
    elif mean < median:
        skewed = 'LeftSkewed'
    else:
        skewed = 'NotSkewed'
    row = {'Field': column,
           'Description': descriptions[column],
           'Total No of Data': series.count(),
           'Min': series.min(),
           'Max': series.max(),
           'Mean': mean,
           'Median(Q2)': median,
           'Std Dev': series.std(),
           'Q1': Q1,
           'Q3': Q3,
           'Total No of Missing': series.isnull().sum(),
           'Total Number of Outliers': series[(series < lower) | (series > upper)].count(),
           'Lowerbound': lower,
           'UpperBound': upper,
           'No of Outliers under lowerbound': series[series < lower].count(),
           'No of Outliers above Upperbound': series[series > upper].count(),
           'Skewed': skewed}
    # DataFrame.append was removed in pandas 2.0; build the one-row frame
    # directly, preserving the original column order.
    ordered_columns = ['Field', 'Description', 'Total No of Data', 'Min', 'Max',
                       'Mean', 'Median(Q2)', 'Std Dev', 'Q1', 'Q3',
                       'Total No of Missing', 'Total Number of Outliers',
                       'Skewed', 'Lowerbound', 'UpperBound',
                       'No of Outliers under lowerbound',
                       'No of Outliers above Upperbound']
    return pd.DataFrame([row], columns=ordered_columns)
# Build one summary row per column, then stack them into a single overview
# table of descriptive stats, skew direction and IQR-based outlier counts.
cement_data=findOutliers('cement')
slag_data=findOutliers('slag')
ash_data=findOutliers('ash')
water_data=findOutliers('water')
superplastic_data=findOutliers('superplastic')
coarseagg_data=findOutliers('coarseagg')
fineagg_data=findOutliers('fineagg')
age_data=findOutliers('age')
strength_data=findOutliers('strength')
pd.concat([cement_data,slag_data,ash_data,water_data,superplastic_data,coarseagg_data,fineagg_data,age_data,strength_data])
As seen above in df.skew(), we need to take care of the skewness in age and superplastic. The rest of the variables we will simply standardize.
# Histograms of every feature plus the target, to eyeball each distribution.
import matplotlib.pyplot as plt
df[['cement','slag','ash','water','superplastic','coarseagg','fineagg','age','strength']].hist(bins=50,figsize=(15,6));
plt.show();
1) There are no missing values to impute. 2) There are a lot of zeroes in slag (471, 45.7%), ash (566, 56%) and superplastic (379, 36%). We will keep them as-is for now and check performance; if that is unsatisfactory, we can replace the zeroes with the median, in which case skewness for these fields must be re-checked. 3) From the skewness values, age (3.269177) and superplastic (0.907203) are highly skewed, so we may apply a log transform to these fields during cross-validation in a pipeline. 4) We will apply StandardScaler to all the other fields during cross-validation in a pipeline. 5) Next, check correlations in the bivariate analysis to see whether any independent fields are correlated.
# Pairwise scatter plots for a visual check of bivariate relationships.
sns.pairplot(df)
# Correlation heatmap with coefficients annotated in each cell.
sns.heatmap(df.corr(),annot=True)
# None of the variables has a strong relationship with the target (strength);
# only cement shows a moderate relationship with strength.
corr_matrix = df.corr()
corr_matrix['strength'].sort_values(ascending=False)
1) None of the variables has a strong relationship with the target variable strength; only cement shows a moderate relationship with strength. 2) Water is moderately correlated with superplastic and fineagg.
# Separate the predictors (X) from the target (y = compressive strength).
X=df.drop('strength',axis=1)
y=df['strength']
# Hold out 20% of the data as the final test set.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=7)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
# Carve a validation set (20% of the training portion) for model tuning.
X_train_split,X_valid,y_train_split,y_valid = train_test_split(X_train,y_train,test_size=0.2,random_state=7)
X_train_split.shape,X_valid.shape,y_train_split.shape,y_valid.shape
X_train.info()
# Backward Elimination:
# iteratively drop the feature with the largest OLS p-value until every
# remaining feature is significant at the 5% level.
cols = list(X_train_split.columns)
while cols:
    X_1 = sm.add_constant(X_train_split[cols])
    model = sm.OLS(y_train_split, X_1).fit()
    # Skip the constant's p-value; align the rest with the feature names.
    p = pd.Series(model.pvalues.values[1:], index=cols)
    if p.max() > 0.05:
        cols.remove(p.idxmax())
    else:
        break
selected_features_BE = cols
print(selected_features_BE)
# mlxtend's SequentialFeatureSelector performs greedy wrapper-based selection.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.data import wine_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
# Build Lin Reg to use in feature selection
linR = LinearRegression()
# Sequential Forward Selection:
# k_features=(1, 8) lets the selector keep the best-scoring subset size in
# that range, scored by 5-fold CV negative MSE.
sfs = SFS(linR,
          k_features=(1, 8),
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=5,
          n_jobs=-1)
# Fit on standardized features so coefficients/scores are scale-independent.
pipe = make_pipeline(StandardScaler(), sfs)
pipe.fit(X_train_split, y_train_split)
print('\nSequential Forward Selection :')
print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
#print('all subsets:\n', sfs.subsets_)
#plot_sfs(sfs.get_metric_dict(), kind='std_err');
###################################################
# Sequential Backward Selection: start from all features and greedily remove
# the one whose removal hurts the 5-fold CV neg-MSE score the least.
sbs = SFS(linR,
          k_features=(1, 8),
          forward=False,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=5,
          n_jobs=-1)
pipe = make_pipeline(StandardScaler(), sbs)
pipe.fit(X_train_split, y_train_split)
print('\nSequential Backward Selection :')
print('best combination (ACC: %.3f): %s\n' % (sbs.k_score_, sbs.k_feature_idx_))
#plot_sfs(sbs.get_metric_dict(), kind='std_err');
###################################################
# Sequential Forward Floating Selection: like forward selection, but after
# each addition it may also drop previously added features ("floating").
sffs = SFS(linR,
           k_features=(1, 8),
           forward=True,
           floating=True,
           scoring='neg_mean_squared_error',
           cv=5,
           n_jobs=-1)
pipe = make_pipeline(StandardScaler(), sffs)
pipe.fit(X_train_split, y_train_split)
print('\nSequential Forward Floating Selection :')
print('best combination (ACC: %.3f): %s\n' % (sffs.k_score_, sffs.k_feature_idx_))
#plot_sfs(sffs.get_metric_dict(), kind='std_err');
###################################################
# Sequential Backward Floating Selection
sbfs = SFS(linR,
           k_features=(1, 8),
           forward=False,
           floating=True,
           scoring='neg_mean_squared_error',
           cv=5,
           n_jobs=-1)
# Consistency fix: fit inside the same StandardScaler pipeline as the other
# three selectors, so all four strategies are compared on identically scaled
# features (the original fitted this one selector on unscaled data).
pipe = make_pipeline(StandardScaler(), sbfs)
pipe.fit(X_train_split, y_train_split)
print('\nSequential Backward Floating Selection :')
print('best combination (ACC: %.3f): %s\n' % (sbfs.k_score_, sbfs.k_feature_idx_))
print(X_train_split.columns[0])
#plot_sfs(sbfs.get_metric_dict(), kind='std_err');
Based on the feature-selection methods, ['cement', 'slag', 'ash', 'water', 'superplastic', 'age'] are important, while 'coarseagg' and 'fineagg' are removed.
from sklearn.svm import LinearSVR
from sklearn.feature_selection import RFE
# Recursive Feature Elimination: rank features by repeatedly discarding the
# weakest one according to a linear regression fitted on scaled inputs.
rfe_selector = RFE(LinearRegression(), n_features_to_select=5, step=1)
rfe_pipeline = make_pipeline(StandardScaler(), rfe_selector)
rfe_pipeline.fit(X_train_split, y_train_split)
# Rank 1 marks a selected feature; higher ranks were eliminated earlier.
print(rfe_pipeline.named_steps['rfe'].ranking_)
# LassoCV tunes the L1 regularization strength alpha by cross-validation.
from sklearn.linear_model import LassoCV
reg = LassoCV()
reg.fit(X_train_split, y_train_split)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X_train_split,y_train_split))
# Coefficients shrunk exactly to zero are the features Lasso discarded.
coef = pd.Series(reg.coef_, index = X.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
# Horizontal bar chart of the (signed) Lasso coefficients per feature.
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model")
1) Based on all four methods above, ['cement', 'slag', 'ash', 'water', 'superplastic', 'age'] are important, while 'coarseagg' and 'fineagg' are not that important.
2) We will run the model both with all the fields and without those two fields, and compare the performance of the model.
Plan:
1) Select a model using all the fields and check the CV score.
2) Select a model using only ['cement', 'slag', 'ash', 'water', 'superplastic', 'age'] (based on feature selection) and check the CV score.
3) Select a model using PolynomialFeatures for feature extraction and check the CV score.
import warnings; warnings.simplefilter('ignore')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoLars
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
# Shared preprocessing: Yeo-Johnson power transform to reduce skew (handles
# zeros and negatives, unlike Box-Cox), then standardize to zero mean and
# unit variance. Commented alternatives were tried and left for reference.
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PowerTransformer(method = 'yeo-johnson', standardize = False),
    StandardScaler()
    #RobustScaler()
)
def quick_eval(pipeline, X_train, y_train,X_test,y_test , verbose=True):
    """
    Cross-validate `pipeline` on the training data with a 5-split ShuffleSplit
    and return a tuple of
    (regressor class name, mean negative MSE across folds, 2 * std of scores).

    NOTE(review): despite the signature, nothing is trained on or scored
    against X_test/y_test -- those parameters and `verbose` are accepted but
    unused; they are kept only for compatibility with existing call sites.
    """
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    #clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
    scores =cross_val_score(pipeline, X_train, y_train, cv=cv,scoring='neg_mean_squared_error')
    #print(f"Regression algorithm: {pipeline.named_steps['regressor'].__class__.__name__}")
    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return pipeline.named_steps['regressor'].__class__.__name__,scores.mean(), scores.std() * 2
# Candidate regressors compared under the same preprocessing pipeline.
regressors = [
    LinearRegression(),
    Lasso(alpha=.5),
    Ridge(alpha=.1),
    LassoLars(alpha=.1),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor()
]
# Creating an empty Dataframe with column names only
resultsDfwithallColumns = pd.DataFrame(columns=['Method', 'MSE All columns', 'STD Dev'])
# Cross-validate each candidate on ALL features and collect its scores.
for r in regressors:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', r)
    ])
    eval_scores = quick_eval(pipe, X_train_split, y_train_split, X_valid, y_valid)
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame.
    resultsDfwithallColumns = pd.concat(
        [resultsDfwithallColumns,
         pd.DataFrame([{'Method': eval_scores[0],
                        'MSE All columns': eval_scores[1],
                        'STD Dev': eval_scores[2]}])],
        ignore_index=True)
# Store performance scores of the model in a dataframe for comparison
resultsDfwithallColumns
# Re-run the same comparison using only the subset chosen by feature selection.
featurestoSelect = ['cement', 'slag', 'ash', 'water', 'superplastic', 'age']
X_train_split[featurestoSelect]
# Creating an empty Dataframe with column names only
resultsDfwithSelectedColumns = pd.DataFrame(columns=['Method', 'MSE Selected columns', 'STD Dev'])
for r in regressors:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', r)
    ])
    eval_scores = quick_eval(pipe, X_train_split[featurestoSelect], y_train_split,
                             X_valid[featurestoSelect], y_valid)
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame.
    resultsDfwithSelectedColumns = pd.concat(
        [resultsDfwithSelectedColumns,
         pd.DataFrame([{'Method': eval_scores[0],
                        'MSE Selected columns': eval_scores[1],
                        'STD Dev': eval_scores[2]}])],
        ignore_index=True)
resultsDfwithSelectedColumns
# Merge with the all-columns results side by side for comparison.
result = pd.concat([resultsDfwithallColumns, resultsDfwithSelectedColumns.drop('Method',axis=1)], axis=1, join='inner')
result
# Creating an empty Dataframe with column names only
resultsDfwithPolyColumns = pd.DataFrame(columns=['Method', 'MSE Polynomial columns', 'STD Dev'])
# Feature-extraction variant: add degree-3 polynomial/interaction terms before
# the power transform. NOTE: this rebinds the shared `preprocessor`.
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PolynomialFeatures(degree=3),
    PowerTransformer(method = 'yeo-johnson', standardize = False)
    #StandardScaler()
    #RobustScaler()
)
for r in regressors:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', r)
    ])
    eval_scores = quick_eval(pipe, X_train_split, y_train_split, X_valid, y_valid)
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame.
    resultsDfwithPolyColumns = pd.concat(
        [resultsDfwithPolyColumns,
         pd.DataFrame([{'Method': eval_scores[0],
                        'MSE Polynomial columns': eval_scores[1],
                        'STD Dev': eval_scores[2]}])],
        ignore_index=True)
resultsDfwithPolyColumns
# Append the polynomial-feature scores to the running comparison table.
result = pd.concat([result, resultsDfwithPolyColumns.drop('Method',axis=1)], axis=1, join='inner')
result
1) Based on the table above, I will use RandomForestRegressor and GradientBoostingRegressor as my initial models to tune.
df.head()
# Zero counts per earlier profiling: slag(471,45.7%), ash(566,56%), superplastic(379,36%).
# Count rows where AT LEAST ONE of the three additive columns is zero.
len(df[(df['superplastic']==0) | (df['ash']==0) | (df['slag']==0)])
# slag(471,45.7%), ash(566,56%), superplastic(379,36%).
# Count rows where ALL THREE additive columns are zero simultaneously.
len(df[(df['superplastic']==0) & (df['ash']==0) & (df['slag']==0)])
We cannot blindly replace the zeroes above with the mean, because there are too many of them, and we cannot drop the rows without losing a large share of the data. So we keep them as-is and tune our models. The best way to handle these zeroes would be to collect the true values from the data source.
#!pip install bayesian-optimization
#from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error
def evaluate_model(model):
    """Return (train MSE, validation MSE) for an already-fitted model.

    Uses the module-level train/validation splits (X_train_split, y_train_split,
    X_valid, y_valid).
    """
    train_mse = mean_squared_error(y_train_split, model.predict(X_train_split))
    valid_mse = mean_squared_error(y_valid, model.predict(X_valid))
    return train_mse, valid_mse
from sklearn.model_selection import GridSearchCV
# Baseline preprocessing for tuning: Yeo-Johnson transform then standardize.
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PowerTransformer(method = 'yeo-johnson', standardize = False),
    StandardScaler()
    #RobustScaler()
)
# Untuned random forest as a reference point for the tuning experiments.
selector_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=20, n_estimators=20))
])
selector_pipeline.fit(X_train_split, y_train_split);
modeltuningresults = pd.DataFrame(columns=['Method', 'MSE Train', 'MSE Valid'])
scores = evaluate_model(selector_pipeline)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Base RandomForestRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
# NOTE(review): this pipeline is built but never fitted or evaluated before
# being rebuilt below -- it appears to be leftover notebook state.
selector_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=20,n_estimators=200))
])
modeltuningresults
from sklearn.model_selection import GridSearchCV
selector_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=20, n_estimators=200))
])
# Exhaustive grid over forest size/shape, scored by 5-fold CV negative MSE.
param_grid = {
    'regressor__max_features': [5, 6, 7, 8],
    'regressor__max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'regressor__n_estimators': [50, 100, 150, 200]
}
gridsearch = GridSearchCV(selector_pipeline,
                          param_grid,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          n_jobs=-1,
                          verbose=True)
gridsearch.fit(X_train_split, y_train_split);
# best_score_ is a NEGATIVE MSE; the original first print mislabeled the raw
# value as RMSE. Label it correctly and derive the true RMSE below.
print(f"The best estimator had neg-MSE {(gridsearch.best_score_)} and the following parameters:")
print(f"The best estimator had RMSE {np.sqrt(-gridsearch.best_score_)} and the following parameters:")
print(gridsearch.best_params_)
scores = evaluate_model(gridsearch.best_estimator_)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "GridSearchCV RandomForestRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
# (A dead, never-fitted pipeline rebuild that followed here was removed.)
modeltuningresults
from sklearn.model_selection import RandomizedSearchCV
selector_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=20, n_estimators=200))
])
# Random search samples parameter combinations instead of trying them all.
param_grid = {
    'regressor__max_features': range(3, 8),
    'regressor__max_depth': range(1, 5),
    'regressor__n_estimators': range(1, 50)
}
gridsearch = RandomizedSearchCV(selector_pipeline,
                                param_grid,
                                scoring='neg_mean_squared_error',
                                cv=5,
                                n_jobs=-1,
                                verbose=True)
gridsearch.fit(X_train_split, y_train_split);
# best_score_ is a NEGATIVE MSE; the original first print mislabeled the raw
# value as RMSE. Label it correctly and derive the true RMSE below.
print(f"The RandomizedSearchCV best estimator had neg-MSE {(gridsearch.best_score_)} and the following parameters:")
print(f"The RandomizedSearchCV best estimator had RMSE {np.sqrt(-gridsearch.best_score_)} and the following parameters:")
print(gridsearch.best_params_)
scores = evaluate_model(gridsearch.best_estimator_)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "RandomizedSearchCV RandomForestRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from bayes_opt import BayesianOptimization
# Define the objective for Bayesian optimization.
def rf_bo(max_depth, n_estimators, max_features):
    """Objective function scored by BayesianOptimization.

    BayesianOptimization MAXIMIZES its objective, so we must return the
    NEGATIVE training MSE. (The original returned the raw MSE, which made
    the search hunt for the WORST hyperparameters.)
    """
    rf = RandomForestRegressor(n_estimators=int(n_estimators),
                               max_depth=int(max_depth),
                               max_features=int(max_features),
                               random_state=2)
    rf.fit(X_train_split, y_train_split)
    # NOTE(review): scoring on the training set favours overfitting; a
    # cross-validated score would be a sounder objective.
    return -mean_squared_error(y_train_split, rf.predict(X_train_split))
# Set a hyperparameter search space
parameters = {'max_depth': (1, 10),
              'n_estimators': (10, 100),
              'max_features': (1, 8)}
# Run a hyperparameter search
BO = BayesianOptimization(rf_bo, parameters, random_state=1)
BO.maximize()
BO.max
# Random forest REGRESSOR (the original comment said "Classifier") refitted
# with the tuned parameters.
rfbo = RandomForestRegressor(max_depth=3, max_features=3, n_estimators=46,
                             random_state=20).fit(X_train_split, y_train_split)
# Create model scores for the train and validation sets
scores = evaluate_model(rfbo)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Bayesian Optimization RandomForestRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn.model_selection import GridSearchCV
# Untuned gradient boosting as a reference point.
selector_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=20, n_estimators=20))
])
selector_pipeline.fit(X_train_split, y_train_split);
scores = evaluate_model(selector_pipeline)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Base GradientBoostingRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn.model_selection import GridSearchCV
selector_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=20, n_estimators=200))
])
# Grid over tree shape, ensemble size and learning rate.
param_grid = {
    'regressor__max_features': [5, 6, 7, 8],
    'regressor__max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'regressor__n_estimators': [50, 100, 150, 200],
    'regressor__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
}
gridsearch = GridSearchCV(selector_pipeline,
                          param_grid,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          n_jobs=-1,
                          verbose=True)
gridsearch.fit(X_train_split, y_train_split);
# best_score_ is a NEGATIVE MSE; the original first print mislabeled the raw
# value as RMSE. Label it correctly and derive the true RMSE below.
print(f"The best estimator had neg-MSE {(gridsearch.best_score_)} and the following parameters:")
print(f"The best estimator had RMSE {np.sqrt(-gridsearch.best_score_)} and the following parameters:")
print(gridsearch.best_params_)
scores = evaluate_model(gridsearch.best_estimator_)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "GridSearchCV GradientBoostingRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn.model_selection import RandomizedSearchCV
selector_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=20, n_estimators=200))
])
# Sample random combinations from wider ranges than the grid search above.
param_grid = {
    'regressor__max_features': list(range(1, 8)),
    'regressor__max_depth': list(range(1, 10)),
    'regressor__n_estimators': list(range(1, 100)),
    'regressor__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
}
gridsearch = RandomizedSearchCV(selector_pipeline,
                                param_grid,
                                scoring='neg_mean_squared_error',
                                cv=5,
                                n_jobs=-1,
                                verbose=True)
gridsearch.fit(X_train_split, y_train_split);
# best_score_ is a NEGATIVE MSE; the original first print mislabeled the raw
# value as RMSE. Label it correctly and derive the true RMSE below.
print(f"The best estimator had neg-MSE {(gridsearch.best_score_)} and the following parameters:")
print(f"The best estimator had RMSE {np.sqrt(-gridsearch.best_score_)} and the following parameters:")
print(gridsearch.best_params_)
scores = evaluate_model(gridsearch.best_estimator_)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "RandomizedSearchCV GradientBoostingRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn import datasets
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import uniform
from xgboost import XGBRegressor
# Instantiate an XGBRegressor with default hyperparameter settings
xgb = XGBRegressor()
# and compute a baseline to beat with hyperparameter optimization
baseline = cross_val_score(xgb, X_train_split, y_train_split, scoring='neg_mean_squared_error').mean()
baseline
# Hyperparameters to tune and their ranges (continuous ones drawn uniformly)
param_dist = {"learning_rate": uniform(0, 1),
              "gamma": uniform(0, 5),
              "max_depth": range(1, 50),
              "n_estimators": range(1, 300),
              "min_child_weight": range(1, 10)}
rs = RandomizedSearchCV(xgb, param_distributions=param_dist,
                        scoring='neg_mean_squared_error', n_iter=25)
# Run random search for 25 iterations
rs.fit(X_train_split, y_train_split);
# best_score_ is a NEGATIVE MSE; the original first print mislabeled the raw
# value as RMSE. Label it correctly and derive the true RMSE below.
print(f"The best estimator had neg-MSE {(rs.best_score_)} and the following parameters:")
print(f"The best estimator had RMSE {np.sqrt(-rs.best_score_)} and the following parameters:")
print(rs.best_estimator_)
scores = evaluate_model(rs.best_estimator_)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "RandomizedSearchCV XGBRegressor",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn.tree import DecisionTreeRegressor
# Evaluate simpler, less overfit-prone baselines with the shared preprocessor.
# The original repeated the same cell four times; deduplicated into one loop
# (DataFrame.append was also removed in pandas 2.0, hence pd.concat).
for method_name, base_regressor in [
    ("LinearRegression", LinearRegression()),
    ("RidgeRegression", Ridge()),
    ("LASSO", Lasso()),
    ("DecisionTreeRegressor", DecisionTreeRegressor()),
]:
    selector_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', base_regressor)
    ])
    rf = selector_pipeline.fit(X_train_split, y_train_split)
    scores = evaluate_model(rf)
    # Store performance scores of the model in a dataframe for comparison
    modeltuningresults = pd.concat(
        [modeltuningresults,
         pd.DataFrame([{'Method': method_name,
                        'MSE Train': scores[0],
                        'MSE Valid': scores[1]}])],
        ignore_index=True)
modeltuningresults
np.random.uniform(0, 1)  # leftover scratch call -- has no effect on the analysis
# Candidate scalers and regularization strengths (powers of 2 from 2^-6 to 2^5).
scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer(), PowerTransformer()]
alpha_to_test = 2.0 ** np.arange(-6, +6)
alpha_to_test
from sklearn.model_selection import GridSearchCV
# Jointly search over the scaler step and the Lasso alpha.
params = [
    {'scaler': scalers_to_test,
     'regressor__alpha': alpha_to_test}
]
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Lasso())
])
gridsearch = GridSearchCV(pipe, params, verbose=1, cv=5).fit(X_train_split, y_train_split)
print(gridsearch.best_estimator_)
# Refit the winning configuration (QuantileTransformer + Lasso, alpha = 2^-6).
# The hand-transcribed estimator dump was reduced to non-default arguments:
# the removed `normalize` keyword would crash on scikit-learn >= 1.2, and the
# remaining arguments were all defaults anyway.
selector_pipeline = Pipeline([
    ('scaler', QuantileTransformer(n_quantiles=1000,
                                   output_distribution='uniform',
                                   subsample=100000)),
    ('regressor', Lasso(alpha=0.015625))
])
rf = selector_pipeline.fit(X_train_split, y_train_split)
scores = evaluate_model(rf)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Lasso GridSearch",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn.model_selection import GridSearchCV
# Degree-3 polynomial feature extraction followed by Yeo-Johnson transform.
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PolynomialFeatures(degree=3),
    PowerTransformer(method = 'yeo-johnson', standardize = False)
    #StandardScaler()
    #RobustScaler()
)
params = [
    {'regressor__alpha': alpha_to_test}
]
pipe = Pipeline([
    ('scaler', preprocessor),
    ('regressor', Ridge())
])
gridsearch = GridSearchCV(pipe, params, verbose=1, cv=5).fit(X_train_split, y_train_split)
print(gridsearch.best_estimator_)
# Refit the winning configuration. The hand-transcribed estimator dump was
# reduced to non-default arguments: the removed `normalize` keyword would
# crash on scikit-learn >= 1.2; everything else was a default.
selector_pipeline = Pipeline([
    ('scaler', make_pipeline(PolynomialFeatures(degree=3),
                             PowerTransformer(method='yeo-johnson', standardize=False))),
    ('regressor', Ridge(alpha=0.015625))
])
rf = selector_pipeline.fit(X_train_split, y_train_split)
scores = evaluate_model(rf)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Ridge Polynomial GridSearch",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
# Same Ridge + polynomial setup, but searched with RandomizedSearchCV
# (samples n_iter=10 of the 12 alphas by default instead of trying them all).
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PolynomialFeatures(degree=3),
    PowerTransformer(method = 'yeo-johnson', standardize = False)
    #StandardScaler()
    #RobustScaler()
)
params = [
    {'regressor__alpha': alpha_to_test}
]
pipe = Pipeline([
    ('scaler', preprocessor),
    ('regressor', Ridge())
])
gridsearch = RandomizedSearchCV(pipe, params, verbose=1, cv=5).fit(X_train_split, y_train_split)
print(gridsearch.best_estimator_)
# Refit the winning configuration. The hand-transcribed estimator dump was
# reduced to non-default arguments: the removed `normalize` keyword would
# crash on scikit-learn >= 1.2; everything else was a default.
selector_pipeline = Pipeline([
    ('scaler', make_pipeline(PolynomialFeatures(degree=3),
                             PowerTransformer(method='yeo-johnson', standardize=False))),
    ('regressor', Ridge(alpha=0.015625))
])
rf = selector_pipeline.fit(X_train_split, y_train_split)
scores = evaluate_model(rf)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Ridge Polynomial RandomSearchCV",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
from sklearn.model_selection import GridSearchCV
# Degree-3 polynomial feature extraction followed by Yeo-Johnson transform.
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PolynomialFeatures(degree=3),
    PowerTransformer(method = 'yeo-johnson', standardize = False)
    #StandardScaler()
    #RobustScaler()
)
params = [
    {'regressor__alpha': alpha_to_test}
]
pipe = Pipeline([
    ('scaler', preprocessor),
    ('regressor', Lasso())
])
gridsearch = GridSearchCV(pipe, params, verbose=1, cv=5).fit(X_train_split, y_train_split)
print(gridsearch.best_estimator_)
# Refit the winning configuration. The hand-transcribed estimator dump was
# reduced to non-default arguments: the removed `normalize` keyword would
# crash on scikit-learn >= 1.2; everything else was a default.
selector_pipeline = Pipeline([
    ('scaler', make_pipeline(PolynomialFeatures(degree=3),
                             PowerTransformer(method='yeo-johnson', standardize=False))),
    ('regressor', Lasso(alpha=0.015625))
])
rf = selector_pipeline.fit(X_train_split, y_train_split)
scores = evaluate_model(rf)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Lasso Polynomial GridSearch",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
featurestoSelect
from sklearn.metrics import mean_squared_error
def evaluate_model_featuresToSelect(model, featurestoSelect):
    """Return (train MSE, validation MSE) using only the given feature subset.

    `model` must already be fitted on that same subset of columns.
    """
    train_predictions = model.predict(X_train_split[featurestoSelect])
    valid_predictions = model.predict(X_valid[featurestoSelect])
    return (mean_squared_error(y_train_split, train_predictions),
            mean_squared_error(y_valid, valid_predictions))
from sklearn.model_selection import GridSearchCV
# Repeat the Ridge + polynomial search using only the selected feature subset.
preprocessor = make_pipeline(
    #SimpleImputer(strategy = 'median'),
    PolynomialFeatures(degree=3),
    PowerTransformer(method = 'yeo-johnson', standardize = False)
    #StandardScaler()
    #RobustScaler()
)
params = [
    {'regressor__alpha': alpha_to_test}
]
pipe = Pipeline([
    ('scaler', preprocessor),
    ('regressor', Ridge())
])
gridsearch = GridSearchCV(pipe, params, verbose=1, cv=5).fit(X_train_split[featurestoSelect], y_train_split)
print(gridsearch.best_estimator_)
# Refit the winning configuration. The hand-transcribed estimator dump was
# reduced to non-default arguments: the removed `normalize` keyword would
# crash on scikit-learn >= 1.2; everything else was a default.
selector_pipeline = Pipeline([
    ('scaler', make_pipeline(PolynomialFeatures(degree=3),
                             PowerTransformer(method='yeo-johnson', standardize=False))),
    ('regressor', Ridge(alpha=0.015625))
])
rf = selector_pipeline.fit(X_train_split[featurestoSelect], y_train_split)
scores = evaluate_model_featuresToSelect(rf, featurestoSelect)
# Store performance scores of the model in a dataframe for comparison
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead).
modeltuningresults = pd.concat(
    [modeltuningresults,
     pd.DataFrame([{'Method': "Ridge Polynomial GridSearch with Selected Features",
                    'MSE Train': scores[0],
                    'MSE Valid': scores[1]}])],
    ignore_index=True)
modeltuningresults
1) Initially RandomForestRegressor and GradientBoostingRegressor gave good results under cross_val_score, but when we tested against the validation data these complex models did not perform well — there was a substantial amount of overfitting.
2) So I tuned simpler algorithms such as LinearRegression, Ridge and Lasso, and found they did better on the validation data.
3) When polynomial features were applied to ridge regression, it clearly outperformed all the other algorithms.
So, I will be choosing polynomial ridge regression with all the columns as my final model to run on the test data.
X_train_split.shape, y_train_split.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape
# Final model: degree-3 polynomial features + Yeo-Johnson transform + Ridge.
# The hand-transcribed estimator dump was reduced to non-default arguments
# (the removed `normalize` keyword would crash on scikit-learn >= 1.2).
selector_pipeline = Pipeline([
    ('scaler', make_pipeline(PolynomialFeatures(degree=3),
                             PowerTransformer(method='yeo-johnson', standardize=False))),
    ('regressor', Ridge(alpha=0.015625))
])
rf = selector_pipeline.fit(X_train_split, y_train_split)
scores = evaluate_model(rf)
mse_test = mean_squared_error(y_test, rf.predict(X_test))
# evaluate_model and mean_squared_error return MSE, not RMSE -- the original
# print labels were wrong; report the values under their correct name.
print(f"Train MSE: {scores[0]}")
print(f"Valid MSE: {scores[1]}")
print(f"Test MSE: {mse_test}")
Ridge regression on the test data performed well with all the columns and polynomial features!